suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidytuesdayR))
suppressPackageStartupMessages(library(wesanderson))
suppressPackageStartupMessages(library(sjPlot))
suppressPackageStartupMessages(library(car))
suppressPackageStartupMessages(library(performance))
suppressPackageStartupMessages(library(robust))
theme_set(theme_minimal())
tuesdata <- tt_load('2021-04-20')
## ---- Compiling #TidyTuesday Information for 2021-04-20 ----
## --- There is 1 file available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 1: "netflix_titles.csv"
netflix <- tuesdata$netflix
netflix %>% glimpse()
## Rows: 7,787
## Columns: 12
## $ show_id <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
## $ type <chr> "TV Show", "Movie", "Movie", "Movie", "Movie", "TV Show",…
## $ title <chr> "3%", "7:19", "23:59", "9", "21", "46", "122", "187", "70…
## $ director <chr> NA, "Jorge Michel Grau", "Gilbert Chan", "Shane Acker", "…
## $ cast <chr> "João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Val…
## $ country <chr> "Brazil", "Mexico", "Singapore", "United States", "United…
## $ date_added <chr> "August 14, 2020", "December 23, 2016", "December 20, 201…
## $ release_year <dbl> 2020, 2016, 2011, 2009, 2008, 2016, 2019, 1997, 2019, 200…
## $ rating <chr> "TV-MA", "TV-MA", "R", "PG-13", "PG-13", "TV-MA", "TV-MA"…
## $ duration <chr> "4 Seasons", "93 min", "78 min", "80 min", "123 min", "1 …
## $ listed_in <chr> "International TV Shows, TV Dramas, TV Sci-Fi & Fantasy",…
## $ description <chr> "In a future where the elite inhabit an island paradise f…
netflix %>% count(type)
## # A tibble: 2 × 2
## type n
## <chr> <int>
## 1 Movie 5377
## 2 TV Show 2410
netflix %>% count(release_year)
## # A tibble: 73 × 2
## release_year n
## <dbl> <int>
## 1 1925 1
## 2 1942 2
## 3 1943 3
## 4 1944 3
## 5 1945 3
## 6 1946 2
## 7 1947 1
## 8 1954 2
## 9 1955 3
## 10 1956 2
## # ℹ 63 more rows
netflix %>% reframe(range_release_year = range(release_year)) # 1925-2021
## # A tibble: 2 × 1
## range_release_year
## <dbl>
## 1 1925
## 2 2021
netflix %>% count(country)
## # A tibble: 682 × 2
## country n
## <chr> <int>
## 1 Argentina 50
## 2 Argentina, Brazil, France, Poland, Germany, Denmark 1
## 3 Argentina, Chile 1
## 4 Argentina, Chile, Peru 1
## 5 Argentina, France 1
## 6 Argentina, France, United States, Germany, Qatar 1
## 7 Argentina, Italy 1
## 8 Argentina, Spain 8
## 9 Argentina, United States 1
## 10 Argentina, United States, Mexico 1
## # ℹ 672 more rows
netflix %>% count(duration)
## # A tibble: 216 × 2
## duration n
## <chr> <int>
## 1 1 Season 1608
## 2 10 Seasons 6
## 3 10 min 1
## 4 100 min 97
## 5 101 min 96
## 6 102 min 98
## 7 103 min 101
## 8 104 min 89
## 9 105 min 91
## 10 106 min 97
## # ℹ 206 more rows
netflix %>% count(date_added)
## # A tibble: 1,566 × 2
## date_added n
## <chr> <int>
## 1 " April 15, 2018" 1
## 2 " April 16, 2019" 1
## 3 " April 17, 2016" 1
## 4 " April 20, 2017" 1
## 5 " April 4, 2017" 1
## 6 " August 1, 2017" 1
## 7 " August 13, 2018" 1
## 8 " August 21, 2017" 1
## 9 " August 4, 2017" 3
## 10 " December 1, 2018" 1
## # ℹ 1,556 more rows
netflix_tidy <- netflix %>%
mutate(type = factor(type)) %>%
separate(duration, into = c("duration", "duration_units"), sep = " ", convert = TRUE) %>%
mutate(
duration_units = case_when(
duration_units == "min" ~ "minutes",
duration_units == "Season" ~ "seasons",
duration_units == "Seasons" ~ "seasons"
)
) %>%
mutate(duration_units = factor(duration_units)) %>%
mutate(date_added = mdy(date_added),
year_added = year(date_added)
)
netflix_tidy %>% count(duration)
## # A tibble: 206 × 2
## duration n
## <int> <int>
## 1 1 1608
## 2 2 382
## 3 3 185
## 4 4 87
## 5 5 59
## 6 6 30
## 7 7 19
## 8 8 19
## 9 9 9
## 10 10 7
## # ℹ 196 more rows
netflix_tidy %>% count(duration_units)
## # A tibble: 2 × 2
## duration_units n
## <fct> <int>
## 1 minutes 5377
## 2 seasons 2410
netflix_tidy %>% count(date_added)
## # A tibble: 1,513 × 2
## date_added n
## <date> <int>
## 1 2008-01-01 1
## 2 2008-02-04 1
## 3 2009-05-05 1
## 4 2009-11-18 1
## 5 2010-11-01 1
## 6 2011-05-17 1
## 7 2011-09-27 1
## 8 2011-10-01 11
## 9 2012-02-21 1
## 10 2012-11-14 1
## # ℹ 1,503 more rows
netflix_tidy %>% count(year_added)
## # A tibble: 15 × 2
## year_added n
## <dbl> <int>
## 1 2008 2
## 2 2009 2
## 3 2010 1
## 4 2011 13
## 5 2012 3
## 6 2013 11
## 7 2014 25
## 8 2015 88
## 9 2016 443
## 10 2017 1225
## 11 2018 1685
## 12 2019 2153
## 13 2020 2009
## 14 2021 117
## 15 NA 10
duplicate_show_ids <- netflix_tidy %>%
filter(duplicated(show_id) | duplicated(show_id, fromLast = TRUE))
# -> no duplicates found
anyNA(netflix_tidy)
## [1] TRUE
missing_summary <- netflix_tidy %>%
summarise(
country_missing = sum(is.na(country)),
director_missing = sum(is.na(director)),
cast_missing = sum(is.na(cast))
)
missing_summary
## # A tibble: 1 × 3
## country_missing director_missing cast_missing
## <int> <int> <int>
## 1 507 2389 718
missing_percent <- netflix_tidy %>%
summarise(
country_missing_pct = mean(is.na(country)) * 100,
director_missing_pct = mean(is.na(director)) * 100,
cast_missing_pct = mean(is.na(cast)) * 100
)
missing_percent
## # A tibble: 1 × 3
## country_missing_pct director_missing_pct cast_missing_pct
## <dbl> <dbl> <dbl>
## 1 6.51 30.7 9.22
netflix_tidy %>%
count(type) %>%
ggplot(aes("", n, fill = type)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(title = "Distribution of content types on Netflix",
x = NULL, y = NULL,
fill = "Content type") +
theme_void()
netflix_tidy %>%
count(country, sort = TRUE) %>%
filter(n > 100) %>%
filter(!is.na(country)) %>%
ggplot(aes(fct_reorder(country, n), n)) +
geom_col() +
coord_flip() +
labs(title = "Content by country on Netflix",
x = "Country", y = "Number of movies/shows")
# density plot of release year by type
netflix_tidy %>%
ggplot(aes(release_year, fill = type)) +
geom_density(alpha = 0.5) +
labs(title = "Release year distribution by content type",
x = "Release year", y = "Density",
fill = "Content type") +
facet_wrap(~type, ncol = 1, scales = "free_y")
netflix_tidy %>%
filter(type == "Movie") %>%
filter(listed_in != "Movies") %>%
separate_rows(listed_in, sep = ", ") %>%
group_by(genre = factor(listed_in)) %>%
reframe(
n = n(),
mean_duration = mean(duration),
) %>%
ggplot(aes(mean_duration, reorder(genre, mean_duration), fill = reorder(genre, mean_duration))) +
geom_col() +
theme(legend.position = "none") +
labs(title = "Average duration of movies by genre",
x = "Average duration (minutes)", y = "Genre")
netflix_tidy %>%
filter(type == "Movie") %>%
separate_rows(listed_in, sep = ", ") %>%
count(genre = listed_in, rating) %>%
group_by(genre) %>%
mutate(percentage = n / sum(n) * 100) %>%
ungroup() %>%
ggplot(aes("", percentage, fill = rating)) +
geom_bar(stat = "identity", width = 1, color = "white", size = 0.25) +
coord_polar("y", start = 0) +
facet_wrap(~genre) +
labs(
title = "Ratings Distribution in Each Genre",
fill = "Rating"
) +
theme_void() +
theme(strip.text = element_text(size = 10, face = "bold"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
duration_trend <- netflix_tidy %>%
filter(!is.na(duration), !is.na(release_year), !is.na(type), !is.na(rating)) %>%
filter(type == "Movie")
duration_trend_model_complex <- lm(duration ~ release_year * rating, data = duration_trend)
summary(duration_trend_model_complex)
##
## Call:
## lm(formula = duration ~ release_year * rating, data = duration_trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.818 -13.785 -1.063 13.602 217.062
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.625e+03 4.570e+02 3.555 0.000381 ***
## release_year -7.690e-01 2.287e-01 -3.363 0.000777 ***
## ratingNC-17 3.060e+04 1.370e+04 2.233 0.025592 *
## ratingNR -1.571e+03 7.172e+02 -2.191 0.028521 *
## ratingPG -6.363e+02 5.461e+02 -1.165 0.244011
## ratingPG-13 -6.998e+02 5.407e+02 -1.294 0.195673
## ratingR -1.032e+03 4.989e+02 -2.068 0.038645 *
## ratingTV-14 -1.617e+02 4.754e+02 -0.340 0.733700
## ratingTV-G -4.972e+02 7.752e+02 -0.641 0.521315
## ratingTV-MA -1.134e+03 4.963e+02 -2.285 0.022342 *
## ratingTV-PG -3.934e+02 4.911e+02 -0.801 0.423087
## ratingTV-Y 1.841e+03 1.545e+03 1.192 0.233440
## ratingTV-Y7 -5.375e+03 1.465e+03 -3.668 0.000247 ***
## ratingTV-Y7-FV -7.601e+03 1.047e+04 -0.726 0.467824
## ratingUR -1.701e+03 1.626e+03 -1.046 0.295425
## release_year:ratingNC-17 -1.516e+01 6.800e+00 -2.229 0.025834 *
## release_year:ratingNR 7.898e-01 3.576e-01 2.209 0.027250 *
## release_year:ratingPG 3.260e-01 2.728e-01 1.195 0.232245
## release_year:ratingPG-13 3.623e-01 2.702e-01 1.341 0.180014
## release_year:ratingR 5.270e-01 2.494e-01 2.113 0.034617 *
## release_year:ratingTV-14 9.686e-02 2.378e-01 0.407 0.683737
## release_year:ratingTV-G 2.492e-01 3.857e-01 0.646 0.518191
## release_year:ratingTV-MA 5.729e-01 2.480e-01 2.310 0.020936 *
## release_year:ratingTV-PG 2.041e-01 2.455e-01 0.831 0.405876
## release_year:ratingTV-Y -9.260e-01 7.664e-01 -1.208 0.227014
## release_year:ratingTV-Y7 2.660e+00 7.276e-01 3.656 0.000259 ***
## release_year:ratingTV-Y7-FV 3.769e+00 5.196e+00 0.725 0.468236
## release_year:ratingUR 8.605e-01 8.120e-01 1.060 0.289346
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.43 on 5344 degrees of freedom
## Multiple R-squared: 0.2092, Adjusted R-squared: 0.2052
## F-statistic: 52.37 on 27 and 5344 DF, p-value: < 2.2e-16
tab_model(duration_trend_model_complex, show.aic = TRUE)
| Â | duration | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 1624.89 | 728.95 – 2520.82 | <0.001 |
| release year | -0.77 | -1.22 – -0.32 | 0.001 |
| ratingNC-17 | 30596.18 | 3734.59 – 57457.77 | 0.026 |
| rating [NR] | -1571.16 | -2977.20 – -165.12 | 0.029 |
| rating [PG] | -636.30 | -1706.91 – 434.30 | 0.244 |
| ratingPG-13 | -699.78 | -1759.84 – 360.27 | 0.196 |
| rating [R] | -1031.90 | -2009.90 – -53.90 | 0.039 |
| ratingTV-14 | -161.75 | -1093.77 – 770.27 | 0.734 |
| rating [TV-G] | -497.19 | -2016.93 – 1022.54 | 0.521 |
| rating [TV-MA] | -1134.11 | -2107.04 – -161.17 | 0.022 |
| rating [TV-PG] | -393.43 | -1356.17 – 569.31 | 0.423 |
| rating [TV-Y] | 1840.82 | -1187.48 – 4869.12 | 0.233 |
| rating [TV-Y7] | -5374.62 | -8246.97 – -2502.27 | <0.001 |
| rating [TV-Y7-FV] | -7601.49 | -28125.52 – 12922.54 | 0.468 |
| rating [UR] | -1701.18 | -4888.32 – 1485.95 | 0.295 |
| release_year:ratingNC-17 | -15.16 | -28.49 – -1.83 | 0.026 |
|
release year × rating [NR] |
0.79 | 0.09 – 1.49 | 0.027 |
|
release year × rating [PG] |
0.33 | -0.21 – 0.86 | 0.232 |
| release_year:ratingPG-13 | 0.36 | -0.17 – 0.89 | 0.180 |
| release year × rating [R] | 0.53 | 0.04 – 1.02 | 0.035 |
| release_year:ratingTV-14 | 0.10 | -0.37 – 0.56 | 0.684 |
|
release year × rating [TV-G] |
0.25 | -0.51 – 1.01 | 0.518 |
|
release year × rating [TV-MA] |
0.57 | 0.09 – 1.06 | 0.021 |
|
release year × rating [TV-PG] |
0.20 | -0.28 – 0.69 | 0.406 |
|
release year × rating [TV-Y] |
-0.93 | -2.43 – 0.58 | 0.227 |
|
release year × rating [TV-Y7] |
2.66 | 1.23 – 4.09 | <0.001 |
|
release year × rating [TV-Y7-FV] |
3.77 | -6.42 – 13.95 | 0.468 |
|
release year × rating [UR] |
0.86 | -0.73 – 2.45 | 0.289 |
| Observations | 5372 | ||
| R2 / R2 adjusted | 0.209 / 0.205 | ||
| AIC | 50041.481 | ||
duration_trend_model_simple <- lm(duration ~ release_year, data = duration_trend)
summary(duration_trend_model_simple)
##
## Call:
## lm(formula = duration ~ release_year, data = duration_trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -124.187 -13.468 -0.655 14.532 215.740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1315.74682 79.33185 16.59 <2e-16 ***
## release_year -0.60430 0.03941 -15.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.92 on 5370 degrees of freedom
## Multiple R-squared: 0.04195, Adjusted R-squared: 0.04177
## F-statistic: 235.1 on 1 and 5370 DF, p-value: < 2.2e-16
tab_model(duration_trend_model_simple, show.aic = TRUE)
| Â | duration | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 1315.75 | 1160.22 – 1471.27 | <0.001 |
| release year | -0.60 | -0.68 – -0.53 | <0.001 |
| Observations | 5372 | ||
| R2 / R2 adjusted | 0.042 / 0.042 | ||
| AIC | 51020.368 | ||
plot(duration_trend_model_complex, which = 4)
cooks_distance <- cooks.distance(duration_trend_model_complex)
influential_points <- which(cooks_distance > (4 / nrow(duration_trend)))
influential_points
## 119 121 128 159 176 182 297 337 359 414 415 466 477 518 558 559
## 119 121 128 159 176 182 297 337 359 414 415 466 477 518 558 559
## 561 562 573 597 628 691 694 704 717 742 828 846 850 859 878 882
## 561 562 573 597 628 691 694 704 717 742 828 846 850 859 878 882
## 903 936 951 955 968 1021 1069 1144 1147 1167 1198 1223 1260 1347 1385 1504
## 903 936 951 955 968 1021 1069 1144 1147 1167 1198 1223 1260 1347 1385 1504
## 1507 1508 1534 1556 1560 1617 1652 1653 1691 1715 1723 1739 1816 1912 1941 1980
## 1507 1508 1534 1556 1560 1617 1652 1653 1691 1715 1723 1739 1816 1912 1941 1980
## 1985 1986 2044 2055 2118 2167 2226 2250 2279 2381 2405 2435 2461 2467 2469 2472
## 1985 1986 2044 2055 2118 2167 2226 2250 2279 2381 2405 2435 2461 2467 2469 2472
## 2485 2495 2496 2497 2498 2508 2510 2511 2512 2514 2515 2518 2525 2563 2564 2566
## 2485 2495 2496 2497 2498 2508 2510 2511 2512 2514 2515 2518 2525 2563 2564 2566
## 2569 2583 2592 2602 2660 2688 2698 2700 2714 2746 2769 2841 2918 2925 2930 2946
## 2569 2583 2592 2602 2660 2688 2698 2700 2714 2746 2769 2841 2918 2925 2930 2946
## 2994 3082 3113 3117 3174 3219 3221 3225 3230 3346 3366 3408 3411 3415 3416 3432
## 2994 3082 3113 3117 3174 3219 3221 3225 3230 3346 3366 3408 3411 3415 3416 3432
## 3434 3436 3437 3438 3439 3442 3449 3451 3455 3470 3476 3520 3532 3548 3556 3666
## 3434 3436 3437 3438 3439 3442 3449 3451 3455 3470 3476 3520 3532 3548 3556 3666
## 3720 3724 3725 3735 3740 3780 3784 3851 3861 3886 3888 3899 3912 3922 3933 4012
## 3720 3724 3725 3735 3740 3780 3784 3851 3861 3886 3888 3899 3912 3922 3933 4012
## 4060 4075 4079 4102 4118 4121 4132 4133 4249 4265 4278 4284 4324 4344 4505 4527
## 4060 4075 4079 4102 4118 4121 4132 4133 4249 4265 4278 4284 4324 4344 4505 4527
## 4579 4580 4589 4600 4614 4635 4636 4637 4705 4747 4750 4761 4801 4854 4892 4903
## 4579 4580 4589 4600 4614 4635 4636 4637 4705 4747 4750 4761 4801 4854 4892 4903
## 4912 4913 4921 4938 4942 4999 5006 5010 5029 5075 5158 5174 5230 5246 5254 5259
## 4912 4913 4921 4938 4942 4999 5006 5010 5029 5075 5158 5174 5230 5246 5254 5259
## 5297 5299 5309 5340 5341 5353 5361
## 5297 5299 5309 5340 5341 5353 5361
# removing influential outliers
duration_trend_no_outliers <- duration_trend %>%
slice(-influential_points)
# re-running both models without influential outliers
duration_trend_model_complex_no_outliers <- lm(duration ~ release_year * rating, data = duration_trend_no_outliers)
summary(duration_trend_model_complex_no_outliers)
##
## Call:
## lm(formula = duration ~ release_year * rating, data = duration_trend_no_outliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -97.578 -12.733 -0.788 13.056 98.137
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 873.1014 673.7291 1.296 0.195060
## release_year -0.3946 0.3367 -1.172 0.241315
## ratingNR -838.9893 1011.2287 -0.830 0.406762
## ratingPG -257.2059 736.2109 -0.349 0.726830
## ratingPG-13 -376.7710 742.5183 -0.507 0.611880
## ratingR -212.0949 701.2285 -0.302 0.762312
## ratingTV-14 1465.8605 690.7539 2.122 0.033876 *
## ratingTV-G -3424.4951 1961.5906 -1.746 0.080910 .
## ratingTV-MA -352.6469 706.6159 -0.499 0.617755
## ratingTV-PG 1298.1721 720.8872 1.801 0.071793 .
## ratingTV-Y 8442.6682 2287.5927 3.691 0.000226 ***
## ratingTV-Y7 -5200.0283 2095.6263 -2.481 0.013120 *
## ratingUR 32.2530 23.5451 1.370 0.170796
## release_year:ratingNR 0.4242 0.5039 0.842 0.399918
## release_year:ratingPG 0.1365 0.3677 0.371 0.710403
## release_year:ratingPG-13 0.2010 0.3708 0.542 0.587808
## release_year:ratingR 0.1187 0.3503 0.339 0.734786
## release_year:ratingTV-14 -0.7125 0.3451 -2.064 0.039027 *
## release_year:ratingTV-G 1.6998 0.9735 1.746 0.080849 .
## release_year:ratingTV-MA 0.1836 0.3529 0.520 0.602861
## release_year:ratingTV-PG -0.6368 0.3600 -1.769 0.076982 .
## release_year:ratingTV-Y -4.2001 1.1345 -3.702 0.000216 ***
## release_year:ratingTV-Y7 2.5737 1.0402 2.474 0.013382 *
## release_year:ratingUR NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.98 on 5134 degrees of freedom
## Multiple R-squared: 0.2318, Adjusted R-squared: 0.2285
## F-statistic: 70.4 on 22 and 5134 DF, p-value: < 2.2e-16
tab_model(duration_trend_model_complex_no_outliers, show.aic = TRUE)
## Model matrix is rank deficient. Parameters `release_year:ratingUR` were
## not estimable.
| Â | duration | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 873.10 | -447.69 – 2193.90 | 0.195 |
| release year | -0.39 | -1.05 – 0.27 | 0.241 |
| rating [NR] | -838.99 | -2821.43 – 1143.45 | 0.407 |
| rating [PG] | -257.21 | -1700.49 – 1186.08 | 0.727 |
| ratingPG-13 | -376.77 | -1832.42 – 1078.88 | 0.612 |
| rating [R] | -212.09 | -1586.80 – 1162.61 | 0.762 |
| ratingTV-14 | 1465.86 | 111.69 – 2820.03 | 0.034 |
| rating [TV-G] | -3424.50 | -7270.05 – 421.06 | 0.081 |
| rating [TV-MA] | -352.65 | -1737.92 – 1032.62 | 0.618 |
| rating [TV-PG] | 1298.17 | -115.07 – 2711.42 | 0.072 |
| rating [TV-Y] | 8442.67 | 3958.01 – 12927.32 | <0.001 |
| rating [TV-Y7] | -5200.03 | -9308.35 – -1091.71 | 0.013 |
| rating [UR] | 32.25 | -13.91 – 78.41 | 0.171 |
|
release year × rating [NR] |
0.42 | -0.56 – 1.41 | 0.400 |
|
release year × rating [PG] |
0.14 | -0.58 – 0.86 | 0.710 |
| release_year:ratingPG-13 | 0.20 | -0.53 – 0.93 | 0.588 |
| release year × rating [R] | 0.12 | -0.57 – 0.81 | 0.735 |
| release_year:ratingTV-14 | -0.71 | -1.39 – -0.04 | 0.039 |
|
release year × rating [TV-G] |
1.70 | -0.21 – 3.61 | 0.081 |
|
release year × rating [TV-MA] |
0.18 | -0.51 – 0.88 | 0.603 |
|
release year × rating [TV-PG] |
-0.64 | -1.34 – 0.07 | 0.077 |
|
release year × rating [TV-Y] |
-4.20 | -6.42 – -1.98 | <0.001 |
|
release year × rating [TV-Y7] |
2.57 | 0.53 – 4.61 | 0.013 |
| Observations | 5157 | ||
| R2 / R2 adjusted | 0.232 / 0.228 | ||
| AIC | 46990.286 | ||
duration_trend_model_simple_no_outliers <- lm(duration ~ release_year, data = duration_trend_no_outliers)
summary(duration_trend_model_simple_no_outliers)
##
## Call:
## lm(formula = duration ~ release_year, data = duration_trend_no_outliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.495 -12.859 -0.859 13.929 110.048
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1686.30324 90.90469 18.55 <2e-16 ***
## release_year -0.78802 0.04514 -17.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.42 on 5155 degrees of freedom
## Multiple R-squared: 0.05581, Adjusted R-squared: 0.05563
## F-statistic: 304.7 on 1 and 5155 DF, p-value: < 2.2e-16
tab_model(duration_trend_model_simple_no_outliers, show.aic = TRUE)
| Â | duration | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 1686.30 | 1508.09 – 1864.52 | <0.001 |
| release year | -0.79 | -0.88 – -0.70 | <0.001 |
| Observations | 5157 | ||
| R2 / R2 adjusted | 0.056 / 0.056 | ||
| AIC | 48011.853 | ||
# The model matrix became rank deficient and some paramteres were not estimable. Therefore, I decided not to remove outliers from the model, as they likely represent valid aspects of the data.
qqnorm(residuals(duration_trend_model_complex))
qqline(residuals(duration_trend_model_complex), col = "red")
# looks fairly normal
# too many observations (>5000) to run a Shapiro-wilk test
qqnorm(residuals(duration_trend_model_simple))
qqline(residuals(duration_trend_model_simple), col = "red")
# looks fairly normal
# too many observations (>5000) to run a Shapiro-wilk test
# crPlots(duration_trend_model_complex) -> not available for interactions
crPlots(duration_trend_model_simple)
# looks farily linear
plot(fitted(duration_trend_model_complex), residuals(duration_trend_model_complex), main = "Residuals vs Fitted", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")
plot(duration_trend_model_complex, which = 3)
check_heteroskedasticity(duration_trend_model_complex)
## Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).
# the assumption of homoscedasticity does not seem to be met -> attemtping the log-transformation of the dependent variable
plot(fitted(duration_trend_model_simple), residuals(duration_trend_model_simple), main = "Residuals vs Fitted", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")
plot(duration_trend_model_simple, which = 3)
check_heteroskedasticity(duration_trend_model_simple)
## Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).
# the assumption of homoscedasticity does not seem to be met
duration_trend <- duration_trend %>%
mutate(duration_log = log(duration))
duration_trend_model_complex_log <- lm(duration_log ~ release_year * rating, data = duration_trend)
summary(duration_trend_model_complex_log)
##
## Call:
## lm(formula = duration_log ~ release_year * rating, data = duration_trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.01366 -0.11533 0.02341 0.16911 1.22178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.407e+01 5.517e+00 4.362 1.31e-05 ***
## release_year -9.832e-03 2.761e-03 -3.562 0.000372 ***
## ratingNC-17 2.467e+02 1.654e+02 1.491 0.135934
## ratingNR -1.947e+01 8.658e+00 -2.249 0.024539 *
## ratingPG -1.040e+01 6.592e+00 -1.578 0.114724
## ratingPG-13 -1.200e+01 6.528e+00 -1.838 0.066142 .
## ratingR -1.526e+01 6.022e+00 -2.535 0.011285 *
## ratingTV-14 -9.698e+00 5.739e+00 -1.690 0.091122 .
## ratingTV-G -1.382e+01 9.358e+00 -1.477 0.139699
## ratingTV-MA -1.789e+01 5.991e+00 -2.986 0.002835 **
## ratingTV-PG -8.547e+00 5.928e+00 -1.442 0.149424
## ratingTV-Y 4.135e+01 1.865e+01 2.217 0.026632 *
## ratingTV-Y7 -9.189e+01 1.769e+01 -5.196 2.12e-07 ***
## ratingTV-Y7-FV -1.154e+02 1.264e+02 -0.913 0.361359
## ratingUR -2.106e+01 1.963e+01 -1.073 0.283326
## release_year:ratingNC-17 -1.222e-01 8.209e-02 -1.488 0.136766
## release_year:ratingNR 9.806e-03 4.317e-03 2.272 0.023153 *
## release_year:ratingPG 5.306e-03 3.294e-03 1.611 0.107253
## release_year:ratingPG-13 6.145e-03 3.261e-03 1.884 0.059578 .
## release_year:ratingR 7.769e-03 3.010e-03 2.581 0.009889 **
## release_year:ratingTV-14 5.007e-03 2.870e-03 1.745 0.081108 .
## release_year:ratingTV-G 6.882e-03 4.656e-03 1.478 0.139467
## release_year:ratingTV-MA 9.013e-03 2.994e-03 3.010 0.002622 **
## release_year:ratingTV-PG 4.340e-03 2.964e-03 1.465 0.143104
## release_year:ratingTV-Y -2.076e-02 9.252e-03 -2.244 0.024846 *
## release_year:ratingTV-Y7 4.548e-02 8.783e-03 5.178 2.33e-07 ***
## release_year:ratingTV-Y7-FV 5.721e-02 6.272e-02 0.912 0.361751
## release_year:ratingUR 1.066e-02 9.802e-03 1.088 0.276809
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.307 on 5344 degrees of freedom
## Multiple R-squared: 0.2476, Adjusted R-squared: 0.2438
## F-statistic: 65.14 on 27 and 5344 DF, p-value: < 2.2e-16
tab_model(duration_trend_model_complex_log)
| Â | duration log | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 24.07 | 13.25 – 34.88 | <0.001 |
| release year | -0.01 | -0.02 – -0.00 | <0.001 |
| ratingNC-17 | 246.67 | -77.59 – 570.94 | 0.136 |
| rating [NR] | -19.47 | -36.45 – -2.50 | 0.025 |
| rating [PG] | -10.40 | -23.32 – 2.52 | 0.115 |
| ratingPG-13 | -12.00 | -24.79 – 0.80 | 0.066 |
| rating [R] | -15.26 | -27.07 – -3.46 | 0.011 |
| ratingTV-14 | -9.70 | -20.95 – 1.55 | 0.091 |
| rating [TV-G] | -13.82 | -32.17 – 4.52 | 0.140 |
| rating [TV-MA] | -17.89 | -29.64 – -6.15 | 0.003 |
| rating [TV-PG] | -8.55 | -20.17 – 3.07 | 0.149 |
| rating [TV-Y] | 41.35 | 4.79 – 77.91 | 0.027 |
| rating [TV-Y7] | -91.89 | -126.57 – -57.22 | <0.001 |
| rating [TV-Y7-FV] | -115.37 | -363.13 – 132.39 | 0.361 |
| rating [UR] | -21.06 | -59.53 – 17.42 | 0.283 |
| release_year:ratingNC-17 | -0.12 | -0.28 – 0.04 | 0.137 |
|
release year × rating [NR] |
0.01 | 0.00 – 0.02 | 0.023 |
|
release year × rating [PG] |
0.01 | -0.00 – 0.01 | 0.107 |
| release_year:ratingPG-13 | 0.01 | -0.00 – 0.01 | 0.060 |
| release year × rating [R] | 0.01 | 0.00 – 0.01 | 0.010 |
| release_year:ratingTV-14 | 0.01 | -0.00 – 0.01 | 0.081 |
|
release year × rating [TV-G] |
0.01 | -0.00 – 0.02 | 0.139 |
|
release year × rating [TV-MA] |
0.01 | 0.00 – 0.01 | 0.003 |
|
release year × rating [TV-PG] |
0.00 | -0.00 – 0.01 | 0.143 |
|
release year × rating [TV-Y] |
-0.02 | -0.04 – -0.00 | 0.025 |
|
release year × rating [TV-Y7] |
0.05 | 0.03 – 0.06 | <0.001 |
|
release year × rating [TV-Y7-FV] |
0.06 | -0.07 – 0.18 | 0.362 |
|
release year × rating [UR] |
0.01 | -0.01 – 0.03 | 0.277 |
| Observations | 5372 | ||
| R2 / R2 adjusted | 0.248 / 0.244 | ||
plot(fitted(duration_trend_model_complex_log), residuals(duration_trend_model_complex_log), main = "Residuals vs Fitted", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")
check_heteroskedasticity(duration_trend_model_complex_log)
## Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).
# the assumption of homoscedasticity still does not seem to be met
# I tried running a robust regression with the "robust" package, but ran into multiple errors, so I will continue with the model as is while keeping in mind its limitations.
# I will skip this step since there is only one continuous predictor in the model.
anova(duration_trend_model_simple, duration_trend_model_complex)
## Analysis of Variance Table
##
## Model 1: duration ~ release_year
## Model 2: duration ~ release_year * rating
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 5370 4186702
## 2 5344 3455660 26 731043 43.481 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The likelihood ratio test comparing the two models indicated a significant improvement in fit for the complex model compared to the simple model (F(26) = 43.481, p < 0.001)
summary(duration_trend_model_simple)$adj.r.squared # 0.041
## [1] 0.04176801
summary(duration_trend_model_complex)$adj.r.squared # 0.205
## [1] 0.2052375
# The complex model explains significantly more variance than the simple model.
# AIC comparison
AIC(duration_trend_model_simple, duration_trend_model_complex)
## df AIC
## duration_trend_model_simple 3 51020.37
## duration_trend_model_complex 29 50041.48
# The more complex model has a lower AIC value (rule of thumb: if a model is 2 AIC units lower than the other, it is considered significantly better).
duration_trend %>%
ggplot(aes(release_year, duration)) +
geom_jitter(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Duration of movies over time by rating",
x = "Release year", y = "Duration (minutes)")
## `geom_smooth()` using formula = 'y ~ x'
duration_trend %>%
ggplot(aes(release_year, duration, color = rating)) +
geom_jitter(alpha = 0.2) +
geom_smooth(method = "lm", lwd = 1.5, se = FALSE) +
labs(title = "Duration of movies over time by rating",
x = "Release year", y = "Duration (minutes)",
color = "Rating") +
scale_color_viridis_d()
## `geom_smooth()` using formula = 'y ~ x'